import re
import time

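# The crawl starts from the index page https://book.jd.com/booksort.html.
# Both the top-level categories (e.g. "Fiction") and the sub-categories
# (e.g. "Contemporary Chinese Fiction"), together with the ids that identify them,
# are rendered from an API response, so requesting the page itself does not
# return that data; the API below has to be called directly instead:
# https://pjapi.jd.com/book/sort?source=bookSort&callback=jsonp_1606487589792_60024
# The request can be captured in the Network tab of the browser developer tools.
# Testing shows that the value of the `source` parameter never changes.
# The value of `callback` is "jsonp_" + a millisecond timestamp + "_" + a random four- or five-digit number.
# Below, Python's built-in `time` module is used to generate the timestamp for the request.
# Adding the header `referer: https://book.jd.com/` makes the request look like it came from the official site, which is enough.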
import requests
# Millisecond Unix timestamp, used to build the JSONP-style callback name below.
t = int(time.time() * 1000)
# The URL must include its scheme (protocol); it cannot be omitted.
url = "https://pjapi.jd.com/book/sort?source=bookSort"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    # NOTE(review): the endpoint captured in the browser takes `callback` as a
    # URL query parameter (...&callback=jsonp_<timestamp>_<random>); here it is
    # sent as an HTTP header instead — confirm which one the server honours.
    'callback': 'jsonp_' + str(t) + '_60005',
    # A browser session 'cookie' header used to be listed here but was
    # commented out; the request is sent without any cookie.
    # Make the request look like it was reached from the JD book site.
    'referer': 'https://book.jd.com/',
}
# Bound the wait so an unresponsive server raises requests.exceptions.Timeout
# instead of blocking the script indefinitely.
response2 = requests.get(url, headers=headers, timeout=10)
# Show the headers that were actually sent, then the body decoded as UTF-8.
print(response2.request.headers)
print(response2.content.decode())
# Run this code and inspect the printed request headers and response body: if the
# response contains all the categories, the request succeeded. This is the basis
# for requesting the resources that follow.

# Scratch check: extract the first run of digits (the page number) from a
# pager call string such as "SEARCH.page(5, true)".
# Raw strings keep `\d` from being treated as an invalid string escape.
print(re.findall(r'\d+', "SEARCH.page(5, true)")[0])

# Scratch check: build the next listing URL from the current one.
# Split on "&" into [base URL with `cat`, "page=<n>"]; the name deliberately
# avoids shadowing the builtin `list`.
url_parts = 'https://list.jd.com/list.html?cat=1713,3275,3578&page=3'.split("&")
# NOTE(review): the page value is advanced by 2, exactly as before; presumably
# listing pages step by two — confirm against the site.
str1 = str(int(re.findall(r'\d+', url_parts[1])[0]) + 2)
print(url_parts[0] + '&page=' + str1)



